import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML
import seaborn as sn

# Load the Berkeley admissions data set from the Stata file.
# preserve_dtypes=False lets pandas upcast Stata's numeric storage types to
# its own default dtypes instead of keeping the on-disk ones.
data = pd.read_stata('berkeley.dta', preserve_dtypes=False)

# Summary of the raw gender labels, before they are recoded to 0/1.
data['gender'].describe()
# Recorded output:
# count 4526
# unique 2
# top Male
# freq 2691
# Name: gender, dtype: object
# Recode the string labels of each categorical column as integer codes.
# The lookups use plain dict indexing on purpose: an unexpected label (or a
# second run on already-recoded data) raises KeyError instead of silently
# producing missing values.
# Columns are assigned with data[...] rather than attribute access, because
# attribute assignment creates a plain attribute (not a column) if the column
# name does not already exist.
gender = {'Male': 0, 'Female': 1}
data['gender'] = [gender[item] for item in data['gender']]

admit = {'Rejected': 0, 'Admitted': 1}
data['admit'] = [admit[item] for item in data['admit']]

dept = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}
data['dept'] = [dept[item] for item in data['dept']]

# Independent copy of the female applicants (gender == 1), so that later
# changes to the subset do not touch `data`.
onlywomen = data.loc[data['gender'] == 1].copy()

# Admission outcome among women; the mean is their admission rate.
onlywomen['admit'].describe()
# Recorded output:
# count 1835.000000
# mean 0.303542
# std 0.459913
# min 0.000000
# 25% 0.000000
# 50% 0.000000
# 75% 1.000000
# max 1.000000
# Name: admit, dtype: float64
# Per-column summary statistics; each column gets its own list of
# aggregations, so statistics not requested for a column show up as NaN.
data.agg(
    {
        "gender": ["min", "max", "median", "skew"],
        "admit": ["min", "max", "median", "mean"],
    }
)
# Recorded output:
# | | gender | admit |
# |---|---|---|
# | min | 0.000000 | 0.00000 |
# | max | 1.000000 | 1.00000 |
# | median | 0.000000 | 0.00000 |
# | skew | 0.385339 | NaN |
# | mean | NaN | 0.38776 |
# Column dtypes of the working frame.
data.dtypes
# Recorded output (dept is still `object` here, so this was captured before
# the dept labels were recoded to integers):
# applicant int64
# admit int64
# gender int64
# dept object
# dtype: object
# Ordinary least squares of admit on gender with scikit-learn.
LR = LinearRegression()  # unfitted estimator, default settings

# scikit-learn expects a 2-D feature matrix, hence reshape(-1, 1) to turn each
# column into an (n_samples, 1) numpy array.
X = data['gender'].values.reshape(-1, 1)
y = data['admit'].values.reshape(-1, 1)

LR.fit(X, y)
# Recorded output:
# LinearRegression()

print('Coefficients: \n', LR.coef_)
# Recorded output:
# Coefficients:
# [[-0.14164543]]
# Prepend an intercept column to the numpy design matrix.
X = sm.add_constant(X)

# Fit admit ~ const + gender with statsmodels.
# statsmodels does not add an intercept on its own, so the constant has to be
# part of the regressors passed to OLS. Previously data['gender'] was passed
# alone, which fitted a regression through the origin: the reported gender
# coefficient (0.3035) was just the admission rate of women, not the
# male/female difference (-0.1416) found by the scikit-learn fit.
model = sm.OLS(data['admit'], sm.add_constant(data['gender'])).fit()
model.summary()
# NOTE: the summary recorded below was captured from the earlier fit WITHOUT
# a constant (see its note [1]) and is stale; re-run to regenerate it.
# | Dep. Variable: | admit | R-squared (uncentered): | 0.096 |
# | Model: | OLS | Adj. R-squared (uncentered): | 0.096 |
# | Method: | Least Squares | F-statistic: | 482.4 |
# | Date: | Sun, 10 Oct 2021 | Prob (F-statistic): | 1.11e-101 |
# | Time: | 18:52:17 | Log-Likelihood: | -4049.0 |
# | No. Observations: | 4526 | AIC: | 8100. |
# | Df Residuals: | 4525 | BIC: | 8106. |
# | Df Model: | 1 | ||
# | Covariance Type: | nonrobust |
# | | coef | std err | t | P>|t| | [0.025 | 0.975] |
# | gender | 0.3035 | 0.014 | 21.964 | 0.000 | 0.276 | 0.331 |
# | Omnibus: | 24363.738 | Durbin-Watson: | 0.015 |
# | Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 574.437 |
# | Skew: | 0.385 | Prob(JB): | 1.83e-125 |
# | Kurtosis: | 1.434 | Cond. No. | 1.00 |
# Notes:
# [1] R² is computed without centering (uncentered) since the model does not contain a constant.
# [2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Distinct department values.
data['dept'].unique()
# Recorded output (letter labels, i.e. captured before dept was recoded to
# integer codes):
# array(['A', 'B', 'C', 'D', 'E', 'F'], dtype=object)
# Descriptive statistics for the columns of the recoded frame.
data.describe()
# Recorded output:
# | | applicant | admit | gender | dept |
# |---|---|---|---|---|
# | count | 4526.000000 | 4526.000000 | 4526.000000 | 4526.000000 |
# | mean | 2263.500000 | 0.387760 | 0.405435 | 2.364781 |
# | std | 1306.687989 | 0.487293 | 0.491030 | 1.712402 |
# | min | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
# | 25% | 1132.250000 | 0.000000 | 0.000000 | 1.000000 |
# | 50% | 2263.500000 | 0.000000 | 0.000000 | 2.000000 |
# | 75% | 3394.750000 | 1.000000 | 1.000000 | 4.000000 |
# | max | 4526.000000 | 1.000000 | 1.000000 | 5.000000 |
# OLS of Gini on Income_Per_Capita, with an explicit intercept column.
# NOTE(review): the `data` frame shown in this file only has the columns
# applicant/admit/gender/dept, so these lookups would raise KeyError unless
# `data` is reloaded somewhere not visible here - confirm the intended source.
Income_Gini = sm.OLS(
    endog=data['Gini'],
    exog=sm.add_constant(data['Income_Per_Capita']),
).fit()